/*    Copyright 2010 Tobias Marschall
 *
 *    This file is part of MoSDi.
 *
 *    MoSDi is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    MoSDi is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with MoSDi.  If not, see <http://www.gnu.org/licenses/>.
 */

package mosdi.subcommands;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeSet;

import mosdi.discovery.EvaluatedPattern;
import mosdi.discovery.MotifFinder;
import mosdi.discovery.ObjectiveFunction;
import mosdi.discovery.objectives.AnnotationSumObjective;
import mosdi.discovery.objectives.OccurrenceCountObjective;
import mosdi.discovery.objectives.SequenceCountObjective;
import mosdi.discovery.strategies.BruteForceSearch;
import mosdi.discovery.strategies.OptimumSearch;
import mosdi.discovery.strategies.SingleObjectiveOptimumSearch;
import mosdi.discovery.strategies.ThresholdSearch;
import mosdi.fa.Alphabet;
import mosdi.fa.FiniteMemoryTextModel;
import mosdi.fa.IIDTextModel;
import mosdi.fa.MarkovianTextModel;
import mosdi.index.ScoreIndexCalculator;
import mosdi.index.SuffixTree;
import mosdi.index.SuffixTreeWalker;
import mosdi.paa.IndexSumDAA;
import mosdi.paa.PAA;
import mosdi.paa.TextBasedPAA;
import mosdi.util.BitArray;
import mosdi.util.Iupac;
import mosdi.util.IupacStringConstraints;
import mosdi.util.Log;
import mosdi.util.NamedSequence;
import mosdi.util.SequenceUtils;
import mosdi.util.iterators.IupacPatternIterator;

public class IupacDiscoverySubcommand extends Subcommand {
	
	@Override
	public String usage() {
		return
		super.usage()+" [options] <pattern-length> <objectives> <fasta-file>\n" +
		"\n" +
		" General options\n" +
		"  -i: replace unknown characters in DNA sequences by $\n" +
		"  -t <timelimit>: abort after the given amount of wallclock time (in seconds) has elapsed.\n" +
		"                  Then, give the next pattern that must be examined in order to resume search.\n" +
		"  -N <factor>: If timelimit has been hit, suggest a set of motif ranges that represent workloads\n" +
		"               that are <factor> times as big as the one finished until timelimit was hit.\n" +
		"\n" +
		" Controlling search STRATEGY:\n" +
		"  -T <pvalue-threshold>: Search for all motifs <= threshold (default: optimal motif only)\n" +
		"  -a: AND mode for option -T. All objectives (default: one objective) must be below threshold.\n" +
		"  -B: Perform brute force search: evaluate all motifs\n" +
		"  -I <initial-pvalue-threshold>: Only report optimal motif(s) but limit search to motifs with\n" +
		"                                 p-value better than given threshold\n" +
		"\n" +
		" Controlling MOTIF SPACE to be searched:\n" +
		"  -m: minimum number of letters from {A,C,G,T},{W,S,R,Y,K,M},{B,D,H,V},{N}, default: \"0,0,0,0\"\n" +
		"  -M: maximum number of letters from {A,C,G,T},{W,S,R,Y,K,M},{B,D,H,V},{N}, default: \"<length>,0,0,<length>/2\"\n" +
		"  -l <lower-bound>: only consider patterns s.t. lower-bound<=pattern\n" +
		"  -u <upper-bound>: only consider patterns s.t. pattern<upper-bound\n" +
		"  -E <max-expectation>: upper bound for expectation (motifs with higher expectation\n" +
		"                       are not returned.)\n"+ 
		"  -r: consider reverse complement\n" +
		"\n" +
		" Allowed OBJECTIVES to be given as <objectives> (comma separated list):\n" +
		"   \"occ-count\": total number of occurrences\n" +
		"   \"seq-count\": number of sequences with at least one occurrence\n" +
		"   \"annotation\": (EXPERIMENTAL) sum of annotations at motif positions\n" +
		"\n" +
		" Options influencing objective \"occ-count\"\n" +
//		"  -c <exp-clump-size-bound>: do not calculate upper bound for expected clump size dymanically,\n" +
//		"                             but use given threshold instead.\n" +
		"  -C <good-enough-bound>: when clump size bounds are computed dynamically, no new calculations\n" +
		"                          are made when a bound <=good_enough_bound is known (default: 1.01)\n"+
		"  -O <model-order>: use background model of given order (default: 0)\n" +
		"  -s <pseudo-counts>: pseudocounts to be used for model estimation (default: 0.0)\n" +
		"  -q <q-gram-table-file>: Estimate text model from given q-gram table (default: uniform i.i.d.)\n" +
		"                          A q-gram table can be created by using \"mosdi-utils count-qgrams\".\n" +
		"  -d: use detailed bounds on conditional probabilities (takes longer to precompute,\n" +
		"      but yields better bounds)\n" +
		"\n"+
		" Options influencing objective \"seq-count\"\n" +
		"  -C <good-enough-bound>: when clump size bounds are computed dynamically, no new calculations\n" +
		"                          are made when a bound <=good_enough_bound is known (default: 1.01)\n"+
		"  -O <model-order>: use background model of given order (default: 0)\n" +
		"  -s <pseudo-counts>: pseudocounts to be used for model estimation (default: 0.0)\n" +
		"  -q <q-gram-table-file>: Estimate text model from given q-gram table (default: uniform i.i.d.)\n" +
		"                          A q-gram table can be created by using \"mosdi-utils count-qgrams\".\n" +
		"  -f <min-fraction>: lower bound for the fraction of sequences the motif must occur in,\n" +
		"                     a value of 0.5 means that the motif must occur in at least half of all\n" +
		"                     sequences (default: 0.0).\n" +
		"\n" +
		" Options influencing objective \"annotation\"\n" +
		"  -A <annotation.fa>: file containing an annotation track (mandatory)\n" +
		"  -b <bincount>: Number of bins for discretization of annotation scores (default: 11).\n" +
		"  -p <pseudocounts>: Pseudocounts for estimation of annotation model (default: 0.1).\n" +
		"  -R <order>: Order of annotation model (default: 2).\n" +
		"  -n: maximum number of occurrences, patterns that occur more often are not found.\n" +
		"      (default=1000)";
	}
	
	@Override
	public String description() {
		return "Runs motif discovery on (constrained) IUPAC patterns."; 
	}

	@Override
	public String name() {
		return "discovery";
	}

	@Override
	public int run(String[] args) {
		parseOptions(args, 3, "m:M:ac:BT:O:l:u:rE:C:iA:b:R:p:n:adq:t:I:f:s:N:");

		// Option dependencies
		impliedOptions("a", "T");
		impliedOptions("s", "O");
		impliedOptions("N", "t");
		exclusiveOptions("s", "q");
		exclusiveOptions("B", "T");
		exclusiveOptions("O", "q");
		exclusiveOptions("I", "T");
		exclusiveOptions("I", "B");
		
		// Mandatory arguments
		int patternLength = getIntArgument(0);
		String objectivesArgument = getStringArgument(1);
		String filename = getStringArgument(2);

		// Options
		final Alphabet iupacAlphabet = Alphabet.getIupacAlphabet();
		final Alphabet dnaAlphabet = Alphabet.getDnaAlphabet();
		boolean considerReverse = getBooleanOption("r", false);
		int[] minFrequencies = {0,0,0,0};
		minFrequencies = getRangedIntTupleOption("m", 4, 0, patternLength, minFrequencies);
		int[] maxFrequencies = {patternLength,0,0,patternLength/2};
		maxFrequencies = getRangedIntTupleOption("M", 4, 0, patternLength, maxFrequencies);
		double pValueThreshold = getRangedDoubleOption("T", 0.0, 1.0, -1.0);
		double initialPValueThreshold = getRangedDoubleOption("I", 0.0, 1.0, 1.0);
		boolean pValueThresholdAndMode = getBooleanOption("a", false);
		boolean bruteForce = getBooleanOption("B", false);
		int modelOrder = getNonNegativeIntOption("O", 0);
		double pseudoCounts = getRangedDoubleOption("s", 0.0, Double.MAX_VALUE, 0.0);
		int[] lowerBound = getStringOption("l", iupacAlphabet, null);
		int[] upperBound = getStringOption("u", iupacAlphabet, null);
		double maxExpectation = getRangedDoubleOption("E", 0.0, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY);
		double goodEnoughClumpSizeBound = getRangedDoubleOption("C", 1.0, Double.POSITIVE_INFINITY, 1.01);
		boolean replaceUnknownByMinusOne = getBooleanOption("i", false);
		String annotationFilename = getStringOption("A", null);
		int annotationBinCount = getPositiveIntOption("b", 11);
		int annotationModelOrder = getNonNegativeIntOption("R", 2);
		double annotationPseudoCounts = getDoubleOption("p", 0.1);
		int maxOccurrences = getNonNegativeIntOption("n", 1000);
		boolean useDetailedBounds = getBooleanOption("d", false);
		String qGramTableFilename = getStringOption("q", null);
		int timeLimitParam = getPositiveIntOption("t", 0);
		double minSequenceCount = getRangedDoubleOption("f", 0.0, 1.0, 0.0);
		double newJobsWorkloadFactor = getPositiveDoubleOption("N", -1.0);

		Timelimit timelimit = null;
		if (timeLimitParam>0) timelimit = new Timelimit(timeLimitParam);
		
		Set<String> objectiveNames = null;
		List<NamedSequence> namedSequences = null;
		StringTokenizer st = new StringTokenizer(objectivesArgument, ",",false);
		objectiveNames = new TreeSet<String>();
		while (st.hasMoreTokens()) {
			String token = st.nextToken();
			if (token.equals("occ-count") || token.equals("seq-count") || token.equals("annotation")) {
				objectiveNames.add(token);
			} else {
				Log.errorln("Unknown objective: "+token);
				System.exit(1);
			}
		}
		if (objectiveNames.size()==0) {
			Log.errorln("No objective function specified!");
			System.exit(1);
		}
		if (objectiveNames.contains("annotation") && (annotationFilename==null)) {
			Log.errorln("Option -A is mandatory if \"annotation\" objective is used.");
			System.exit(1);
		}
		Log.startTimer();
		try{
			namedSequences = SequenceUtils.readFastaFile(filename, dnaAlphabet, replaceUnknownByMinusOne);
		} catch (Exception e) {
			Log.errorln(e.toString());
			System.exit(1);
		}
		Log.stopTimer("Reading FASTA file");
		int effectiveLength = 0;
		for (NamedSequence ns : namedSequences) effectiveLength+=ns.length()-patternLength+1;
		int[] sequenceLengths = null;
		if (objectiveNames.contains("seq-count")) {
			sequenceLengths = new int[namedSequences.size()];
			for (int i=0; i<namedSequences.size(); ++i) {
				sequenceLengths[i] = namedSequences.get(i).length();
			}
		}
		Log.startTimer();
		List<int[]> sequences = new ArrayList<int[]>(namedSequences.size());
		for (NamedSequence ns : namedSequences) sequences.add(ns.getSequence());
		FiniteMemoryTextModel textModel = null;
		if (qGramTableFilename==null) {
			textModel = (modelOrder==0)?new IIDTextModel(dnaAlphabet.size(), sequences):new MarkovianTextModel(modelOrder,dnaAlphabet.size(),sequences,pseudoCounts);
		} else {
			try {
				textModel = SequenceUtils.buildTextModelFromQGramFile(qGramTableFilename);
			} catch (Exception e) {
				Log.errorln(e.toString());
				System.exit(1);
			}
		}
		Log.stopTimer("Estimating text model");
		
		SuffixTree suffixTree = new SuffixTree();
		// variables needed to optimize annotation sum
		int[] scoreMaximumAnnotation = null;
		int[] scoreSumAnnotation = null;
		double[] scoreDistribution = null;
		FiniteMemoryTextModel annotationModel = null;
		ScoreIndexCalculator scoreIndexCalculator = null;
		if (objectiveNames.contains("annotation")) {
			Log.startTimer();
			SequenceUtils.readAnnotationTrack("annotation0", annotationFilename, namedSequences, annotationBinCount);
			Log.stopTimer("Reading annotation track");
			int[] concatAnnotation = SequenceUtils.concatAnnotations(namedSequences, "annotation0", considerReverse);
			scoreIndexCalculator = new ScoreIndexCalculator(concatAnnotation, patternLength);
			suffixTree.addNodeConstructionListener(scoreIndexCalculator);
			List<int[]> annotations = new ArrayList<int[]>(namedSequences.size());
			for (NamedSequence ns : namedSequences)	annotations.add(ns.getAnnotationTrack("annotation0"));
			if (annotationModelOrder==0) {
				annotationModel = new IIDTextModel(annotationBinCount, annotations);
			} else {
				annotationModel = new MarkovianTextModel(annotationModelOrder, annotationBinCount, annotations, annotationPseudoCounts);
			}
			Log.startTimer();
			IndexSumDAA daa = new IndexSumDAA(annotationModel.getAlphabetSize(), patternLength*(annotationBinCount-1));
			PAA paa = new TextBasedPAA(daa, annotationModel);
			scoreDistribution = paa.computeValueDistribution(patternLength);
			Log.stopTimer("Computing distribution of annotation sums");
		}
		// allow sequences to be garbage collected
		sequences = null;
		int[] concatSequence = SequenceUtils.concatSequences(namedSequences, considerReverse);
		suffixTree.buildTree(concatSequence, dnaAlphabet.size(), patternLength);
		concatSequence = null;
		if (objectiveNames.contains("annotation")) {
			scoreMaximumAnnotation = scoreIndexCalculator.getScoreMaximumAnnotation();
			scoreSumAnnotation = scoreIndexCalculator.getScoreSumAnnotation();
		}
		if (Log.levelAtLeast(Log.Level.DEBUG)) {
			SuffixTreeWalker walker = suffixTree.getWalker();
			BitArray wildcard = new BitArray(dnaAlphabet.size());
			wildcard.invert();
			for (int i=1; i<=patternLength; ++i) {
				int[] nodes = walker.forward(wildcard);
				Log.printf(Log.Level.DEBUG, "Nodes for prefix length %d: %,d (memory: %,d bytes)\n",i,nodes.length,nodes.length*suffixTree.getNodeSize());
			}
		}
		int[] occurrenceCountAnnotation = null;
		if (objectiveNames.contains("occ-count") || objectiveNames.contains("annotation")) {
			occurrenceCountAnnotation = suffixTree.calcOccurrenceCountAnnotation();
		}
		BitArray[] sequenceOccurrenceAnnotation = null;
		if (objectiveNames.contains("seq-count")) {
			List<int[]> l = SequenceUtils.appendMinusOnes(namedSequences, considerReverse);
			sequenceOccurrenceAnnotation = suffixTree.calcSequenceOccurrenceAnnotation(l);
		}
		// allow garbage collection
		namedSequences = null;

		Log.startTimer();
//		FactorialCalculator fc = new FactorialCalculator(10000);
		BitArray[] generalizedAlphabet = Iupac.toGeneralizedString("ABCDGHKMNRSTVWY").getPositions();
		MotifFinder motifFinder = new MotifFinder(suffixTree, generalizedAlphabet, considerReverse);
		motifFinder.setAbortRequester(timelimit);
		
		// IidPValueSearch search = new IidPValueSearch(sequences, textModel.getCharacterDistribution(), pValueThreshold, occurrenceCountAnnotation, false, expectedClumpSizeBound);
		int[] maxMatchCountAnnotation = suffixTree.calcMaxMatchCountAnnotation(patternLength);
		
		List<ObjectiveFunction> objectives = new ArrayList<ObjectiveFunction>();
		for (String name : objectiveNames) {
			if (name.equals("occ-count")) {
				OccurrenceCountObjective o = new OccurrenceCountObjective(textModel, occurrenceCountAnnotation, maxMatchCountAnnotation, effectiveLength, useDetailedBounds);
				o.setGoodEnoughClumpSizeBound(goodEnoughClumpSizeBound);
				if (!Double.isInfinite(maxExpectation)) o.setMaxExpectation(maxExpectation);
				objectives.add(o);
			}
			if (name.equals("seq-count")) {
				SequenceCountObjective o = new SequenceCountObjective(textModel, sequenceLengths, sequenceOccurrenceAnnotation, useDetailedBounds);
				o.setGoodEnoughClumpSizeBound(goodEnoughClumpSizeBound);
				o.setMinSequenceCount((int)Math.ceil(minSequenceCount*sequenceLengths.length));
				objectives.add(o);
			}
			if (name.equals("annotation")) {
				AnnotationSumObjective o = new AnnotationSumObjective(maxOccurrences, scoreDistribution, occurrenceCountAnnotation, scoreMaximumAnnotation, scoreSumAnnotation);
				objectives.add(o);
			}
		}
		
		MotifFinder.SearchSpecification search = null;
		if (bruteForce) {
			if (Log.levelAtLeast(Log.Level.DEBUG)) {
				search = new BruteForceSearch(objectives, iupacAlphabet, System.out);
			} else {
				search = new BruteForceSearch(objectives, iupacAlphabet, null);
			}
		} else if (pValueThreshold!=-1.0) {
			search = new ThresholdSearch(objectives,  pValueThreshold, System.out, pValueThresholdAndMode);
		} else {
			if (objectives.size()==1) {
				search = new SingleObjectiveOptimumSearch(objectives.get(0), initialPValueThreshold);	
			} else {
				search = new OptimumSearch(objectives);	
			}
		}
		Log.startTimer();
		motifFinder.findIupacPatterns(patternLength, new IupacStringConstraints(minFrequencies,maxFrequencies), search, lowerBound, upperBound);
		Log.stopTimer("Search for IUPAC pattern on suffix tree");
		double timeMotifDiscovery = Log.getLastPeriodCpu();
		Log.print(Log.Level.VERBOSE, "------------------- RESULTS -------------------\n");
		Log.println(Log.Level.STANDARD, "Format: >> pattern >objective> score minus-log-pvalue pvalue");
		for (EvaluatedPattern m : search.getResults()) {
			// StringBuilder sb = new StringBuilder();
			// sb.append(String.format(">>p_value>> %e LIN ", m.getPValue()));
			// sb.append(String.format(">>stats>> %s %d ", (m.getPattern()==null)?"null":iupacAlphabet.buildString(m.getPattern()), m.getScore()));
			Log.println(Log.Level.STANDARD, m.toString(iupacAlphabet, objectives));
		}
		//Log.printf(Log.Level.STANDARD, ">>> %s %d %d %t %e%n", search.getMotifsEvaluated(), (long)Math.round(Math.exp(logNumber)), timeInstanceSearch, bestPValue);
		Log.printf(Log.Level.VERBOSE, ">>!>total_time>>: %t %n", timeMotifDiscovery);
		if ((timelimit!=null) && timelimit.timelimitHit()) {
			Log.printf(Log.Level.STANDARD, ">>TIMELIMIT-HIT>> %s\n", iupacAlphabet.buildString(timelimit.getNextPattern()));
			if (newJobsWorkloadFactor>0.0) {
				IupacStringConstraints c = new IupacStringConstraints(minFrequencies,maxFrequencies);
				IupacPatternIterator iterator = new IupacPatternIterator(patternLength, c);
				if (lowerBound!=null) iterator.skipByLowerBound(lowerBound);
				int[] firstPattern = iterator.next();
				iterator.skipByLowerBound(timelimit.getNextPattern());
				long finishedRange = Iupac.rank(timelimit.getNextPattern(), c) - Iupac.rank(firstPattern, c);
				long increment = (long)(finishedRange * newJobsWorkloadFactor);
				int[] last = iterator.next();
				while (iterator.hasNext()) {
					iterator.skipByNumber(increment);
					if (iterator.hasNext()) {
						int[] current = iterator.next();
						if (arrayIsSmaller(current, upperBound)) {
							Log.printf(Log.Level.STANDARD, ">>R> %s %s\n", iupacAlphabet.buildString(last), iupacAlphabet.buildString(current));
							last = current;
							continue;
						} 
					}
					if (upperBound==null) {
						int[] a = new int[patternLength+1];
						Arrays.fill(a, iupacAlphabet.size()-1);
						Log.printf(Log.Level.STANDARD, ">>R> %s %s\n", iupacAlphabet.buildString(last), iupacAlphabet.buildString(a));
					} else {
						Log.printf(Log.Level.STANDARD, ">>R> %s %s\n", iupacAlphabet.buildString(last), iupacAlphabet.buildString(upperBound));	
					}
					break;
				}
			}
		}
		return 0;
	}

	/** Returns true if a is smaller than b. */
	private static boolean arrayIsSmaller(int[] a, int[] b) {
		if (b==null) return true;
		for (int i=0; i<Math.min(a.length, b.length); ++i) {
			if (a[i]<b[i]) return true;
			if (a[i]>b[i]) return false;
		}
		return a.length<b.length;
	}
	
	private static class Timelimit implements MotifFinder.AbortRequester {
		private long quitTime;
		private int[] nextPattern;
		private boolean timelimitHit;
		
		public Timelimit(int seconds) {
			this.quitTime = System.currentTimeMillis() + ((long)seconds)*1000;
			this.nextPattern = null;
			this.timelimitHit = false;
		}
		
		@Override
		public boolean abortNow(int[] nextPattern) {
			if (System.currentTimeMillis()>=quitTime) {
				this.nextPattern = nextPattern;
				this.timelimitHit = true;
				return true;
			}
			return false;
		}

		public int[] getNextPattern() {
			return nextPattern;
		}

		public boolean timelimitHit() {
			return timelimitHit;
		} 
		
	}
	
}
